Chapter 5 Community composition
5.1 Count data preparation
genome_counts_log <- genome_counts %>%
column_to_rownames(var="genome") %>%
mutate_all(~log10(.+1)) #fixed: mutate_at(vars(), ~log10(.+1))) was not working
genome_counts_pivot <- genome_counts %>%
mutate_at(vars(-genome),~./sum(.)) %>% #apply TSS normalisation
pivot_longer(-genome, names_to = "sample", values_to = "count") %>% #reduce to minimum number of columns
left_join(., genome_metadata, by = join_by(genome == genome)) %>% #append taxonomy
mutate(phylum = fct_relevel(phylum, rev(ehi_phylum_colors$phylum))) #sort phyla by taxonomy
genome_counts_by_host <- sample_metadata %>%
select("sample","species","area_type", "development") %>%
rename(host_sp=species) %>%
left_join(genome_counts_pivot,., by=join_by("sample" == "sample")) #%>%
#mutate(sample=factor(sample, levels = sample_sort)) #alternative to join: sorting by area_type
# Retrieve taxonomy colors to use standardised EHI colors
phylum_colors <- ehi_phylum_colors %>%
filter(phylum %in% unique(genome_counts_by_host$phylum)) %>%
select(colors) %>%
pull() %>%
rev()
phylum_colors <- c(phylum_colors,"#cccccc") #REMOVE! ONLY FOR ARCHAEANS
# Which host species each genome can be found in
genomes_by_species <- genome_counts_by_host %>%
filter(count>0) %>%
group_by(genome) %>%
mutate(host = if_else(all(host_sp == "Sciurus vulgaris"), "only red",
if_else(all(host_sp == "Sciurus carolinensis"), "only grey", "both"))) %>%
select(genome, host) %>%
distinct(genome, .keep_all = TRUE) %>%
left_join(.,genome_metadata, by='genome')
genomes_by_species$host <-factor(genomes_by_species$host, levels = c("both", "only red", "only grey"))5.2 Genomes by host species
# Which phylum the MAG belongs to
phyla <- ehi_phylum_colors %>%
right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
arrange(match(genome, genome_tree$tip.label)) %>%
select(phylum, colors) %>%
unique()
# Generate the phylum color heatmap
phylum_heatmap <- ehi_phylum_colors %>%
right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
arrange(match(genome, genome_tree$tip.label)) %>%
select(genome,phylum) %>%
mutate(phylum = factor(phylum, levels = unique(phylum))) %>%
column_to_rownames(var = "genome")
# Create baseline circular genome tree
circular_tree <- force.ultrametric(genome_tree,method="extend") %>%
ggtree(., layout = 'circular', size = 0.1, angle=45) +
xlim(-1, NA)***************************************************************
* Note: *
* force.ultrametric does not include a formal method to *
* ultrametricize a tree & should only be used to coerce *
* a phylogeny that fails is.ultrametric due to rounding -- *
* not as a substitute for formal rate-smoothing methods. *
***************************************************************
# Add phylum colors ring
circular_tree <- gheatmap(circular_tree, phylum_heatmap, offset=0.0, width=0.2, colnames=FALSE) +
scale_fill_manual(values=phylum_colors, name="Phylum") +
#geom_tiplab2(size=1, hjust=-0.1) +
theme(legend.position = "right", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0))
#Flush color scale to enable a new color scheme in the next ring
circular_tree <- circular_tree + new_scale_fill()
# Add host ring
circular_tree_h <- circular_tree +
new_scale_fill() +
scale_fill_manual(values = c("black", "#ed2939", "#92a0ad"), name="Host\nspecies") + #"#cc3333", "#999999"
geom_fruit(
data=genomes_by_species,
geom=geom_tile,
mapping = aes(y=genome, fill=host),
offset = 0.35,
width=0.2)
#Plot circular tree
circular_tree_h#number of MAGs by host species
genomes_by_species %>%
dplyr::group_by(host) %>%
summarise(n=length(host),
percentage=(length(host)/1687)*100) %>%
tt()| host | n | percentage |
|---|---|---|
| both | 505 | 29.93480 |
| only red | 482 | 28.57143 |
| only grey | 700 | 41.49378 |
#most abundant phyla by host species
genome_counts_by_host %>%
#filter(count>0) %>%
group_by(host_sp,phylum) %>%
summarise(rel_abundance=sum(count)) %>%
top_n(3, rel_abundance) %>%
arrange(by_group=host_sp, desc(rel_abundance)) %>%
tt()| host_sp | phylum | rel_abundance |
|---|---|---|
| Sciurus carolinensis | p__Bacillota_A | 55.472233 |
| Sciurus carolinensis | p__Bacteroidota | 16.149398 |
| Sciurus carolinensis | p__Bacillota | 2.498674 |
| Sciurus vulgaris | p__Bacillota_A | 48.125675 |
| Sciurus vulgaris | p__Bacteroidota | 32.438965 |
| Sciurus vulgaris | p__Actinomycetota | 16.517267 |
#most common phyla by host species
genome_counts_by_host %>%
#filter(count>0) %>%
group_by(host_sp,phylum) %>%
summarise(freq=sum(count>0)/length(host_sp)) %>%
top_n(3, freq) %>%
arrange(by_group=host_sp, desc(freq)) %>%
tt()| host_sp | phylum | freq |
|---|---|---|
| Sciurus carolinensis | p__Campylobacterota | 0.46250000 |
| Sciurus carolinensis | p__Verrucomicrobiota | 0.30000000 |
| Sciurus carolinensis | p__Bacillota_B | 0.27291667 |
| Sciurus vulgaris | p__Campylobacterota | 0.14545455 |
| Sciurus vulgaris | p__Bacillota_C | 0.13522727 |
| Sciurus vulgaris | p__Bacillota_B | 0.08333333 |
#most abundant class by host species
genome_counts_by_host %>%
filter(count>0) %>%
group_by(host_sp,class) %>%
summarise(total_count=sum(count)) %>%
top_n(5, total_count) %>%
arrange(by_group=host_sp, desc(total_count)) %>%
tt()| host_sp | class | total_count |
|---|---|---|
| Sciurus carolinensis | c__Clostridia | 55.472233 |
| Sciurus carolinensis | c__Bacteroidia | 16.149398 |
| Sciurus carolinensis | c__Bacilli | 2.498674 |
| Sciurus carolinensis | c__Verrucomicrobiae | 2.094198 |
| Sciurus carolinensis | c__Vampirovibrionia | 1.253040 |
| Sciurus vulgaris | c__Clostridia | 48.125675 |
| Sciurus vulgaris | c__Bacteroidia | 32.438965 |
| Sciurus vulgaris | c__Actinomycetia | 16.040176 |
| Sciurus vulgaris | c__Bacilli | 8.233587 |
| Sciurus vulgaris | c__Gammaproteobacteria | 2.254878 |
#most abundant family by host species
genome_counts_by_host %>%
filter(count>0) %>%
group_by(host_sp,family) %>%
summarise(abundance=sum(count)) %>%
top_n(5, abundance) %>%
arrange(by_group=host_sp, desc(abundance)) %>%
tt()| host_sp | family | abundance |
|---|---|---|
| Sciurus carolinensis | f__Lachnospiraceae | 29.253695 |
| Sciurus carolinensis | f__Oscillospiraceae | 16.103469 |
| Sciurus carolinensis | f__Bacteroidaceae | 9.370700 |
| Sciurus carolinensis | f__Muribaculaceae | 4.737998 |
| Sciurus carolinensis | f__Borkfalkiaceae | 2.527615 |
| Sciurus vulgaris | f__Lachnospiraceae | 35.793100 |
| Sciurus vulgaris | f__Bacteroidaceae | 21.149209 |
| Sciurus vulgaris | f__Bifidobacteriaceae | 15.986288 |
| Sciurus vulgaris | f__Muribaculaceae | 9.598616 |
| Sciurus vulgaris | f__Oscillospiraceae | 4.267142 |
***************************************************************
* Note: *
* force.ultrametric does not include a formal method to *
* ultrametricize a tree & should only be used to coerce *
* a phylogeny that fails is.ultrametric due to rounding -- *
* not as a substitute for formal rate-smoothing methods. *
***************************************************************
Trovata più di una classe "phylo" in cache; si utilizza il primo, dal namespace 'phyloseq'
Definito anche da 'tidytree'
#Add phylum colors
vertical_tree <- gheatmap(vertical_tree, phylum_heatmap, offset=0, width=0.1, colnames=FALSE, color=NA) +
scale_fill_manual(values=phylum_colors)Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
Scale for fill is already present.
Adding another scale for fill, which will replace the existing scale.
#Reset fill scale
vertical_tree <- vertical_tree + new_scale_fill()
#Add counts
vertical_tree <- gheatmap(vertical_tree, genome_counts_log, offset=0.5, width=3.5, color=NA, colnames=FALSE) + #, colnames_angle=90, font.size=2, colnames_position="top", colnames_offset_y = 9
vexpand(.08) +
coord_cartesian(clip = "off") +
scale_fill_gradient(low = "#f4f4f4", high = "steelblue", na.value="white")Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
Scale for fill is already present.
Adding another scale for fill, which will replace the existing scale.
5.3 Taxonomic composition of samples
# sample_sort <- sample_metadata %>%
# arrange(Area_type) %>%
# select(sample) %>%
# pull()
# Plot stacked barplot
ggplot(genome_counts_by_host, aes(x=sample,y=count,fill=phylum, group=phylum))+ #grouping enables keeping the same sorting of taxonomic units
geom_bar(stat="identity", colour="white", linewidth=0.02)+ #plot stacked bars with white borders
scale_fill_manual(values=phylum_colors, name="Phylum") +
labs(y = "Relative abundance") +
guides(fill = guide_legend(ncol = 1)) +
facet_nested(~host_sp, scales="free", space="free") +
theme(axis.text.x = element_blank(),
axis.title.x = element_blank(),
panel.background = element_blank(),
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(linewidth = 0.5, linetype = "solid", colour = "black"),
legend.position="right",
)phylum_summary <- genome_counts_by_host %>%
group_by(sample,host_sp,phylum) %>%
summarise(relabun=sum(count))
phylum_summary %>%
group_by(phylum) %>%
summarise(mean=mean(relabun, na.rm=TRUE),sd=sd(relabun, na.rm=TRUE)) # A tibble: 13 × 3
phylum mean sd
<fct> <dbl> <dbl>
1 p__Elusimicrobiota 0.00170 0.00694
2 p__Cyanobacteriota 0.00895 0.0118
3 p__Bacillota 0.0565 0.110
4 p__Bacillota_B 0.00255 0.00418
5 p__Bacillota_C 0.00429 0.00664
6 p__Bacillota_A 0.545 0.252
7 p__Actinomycetota 0.0898 0.234
8 p__Patescibacteria 0.0000251 0.000346
9 p__Pseudomonadota 0.0180 0.0738
10 p__Campylobacterota 0.00521 0.0250
11 p__Bacteroidota 0.256 0.178
12 p__Verrucomicrobiota 0.0116 0.0229
13 p__Thermoplasmatota 0.000351 0.00366
phylum_arrange <- phylum_summary %>%
group_by(phylum) %>%
summarise(mean=mean(relabun)) %>%
arrange(-mean) %>%
select(phylum) %>%
pull()
phylum_summary %>%
filter(phylum %in% phylum_arrange) %>%
mutate(phylum=factor(phylum,levels=rev(phylum_arrange))) %>%
ggplot(aes(x=relabun, y=phylum, group=phylum, color=phylum)) +
scale_color_manual(values=phylum_colors[rev(phylum_arrange)], name="Phylum") +
geom_jitter(alpha=0.5) +
facet_nested(~host_sp, scales="free", space="free") +
theme_minimal() +
theme(legend.position="right") +
labs(y="Phylum",x="Relative abundance")family_summary <- genome_counts_by_host %>%
group_by(sample,host_sp,family) %>%
summarise(relabun=sum(count))
family_summary %>%
group_by(family) %>%
summarise(mean=mean(relabun, na.rm=TRUE),sd=sd(relabun, na.rm=TRUE)) %>%
arrange(-mean)# A tibble: 74 × 3
family mean sd
<chr> <dbl> <dbl>
1 f__Lachnospiraceae 0.342 0.192
2 f__Bacteroidaceae 0.161 0.146
3 f__Oscillospiraceae 0.107 0.104
4 f__Bifidobacteriaceae 0.0841 0.233
5 f__Muribaculaceae 0.0755 0.0956
6 f__Ruminococcaceae 0.0329 0.0377
7 f__Borkfalkiaceae 0.0192 0.0270
8 f__Lactobacillaceae 0.0163 0.0514
9 f__Streptococcaceae 0.0161 0.0866
10 f__Acutalibacteraceae 0.0109 0.0135
# ℹ 64 more rows
family_arrange <- family_summary %>%
group_by(family) %>%
summarise(mean=sum(relabun)) %>%
arrange(-mean) %>%
select(family) %>%
pull()
family_summary %>%
left_join(genome_metadata %>% select(family,phylum) %>% unique(),by=join_by(family==family)) %>%
left_join(sample_metadata,by=join_by(sample==sample)) %>%
filter(family %in% family_arrange[1:20]) %>%
mutate(family=factor(family,levels=rev(family_arrange[1:20]))) %>%
filter(relabun > 0) %>%
ggplot(aes(x=relabun, y=family, group=family, color=phylum)) +
scale_color_manual(values=phylum_colors) +
geom_jitter(alpha=0.5) +
facet_grid(.~host_sp)+
theme_minimal() +
labs(y="Family", x="Relative abundance", color="Phylum")genus_summary <- genome_counts_by_host %>%
group_by(sample,host_sp,genus) %>%
summarise(relabun=sum(count))
genus_summary %>%
group_by(genus) %>%
summarise(mean=mean(relabun, na.rm=TRUE),sd=sd(relabun, na.rm=TRUE)) %>%
arrange(-mean)# A tibble: 348 × 3
genus mean sd
<chr> <dbl> <dbl>
1 g__Bifidobacterium 0.0841 0.233
2 g__Prevotella 0.0732 0.0968
3 g__Acetatifactor 0.0429 0.0621
4 g__ 0.0383 0.0427
5 g__Eisenbergiella 0.0322 0.0556
6 g__Faecousia 0.0270 0.0366
7 g__CAG-95 0.0219 0.0326
8 g__Roseburia 0.0215 0.0526
9 g__Phocaeicola 0.0194 0.0292
10 g__Dysosmobacter 0.0179 0.0175
# ℹ 338 more rows
genus_arrange <- genus_summary %>%
group_by(genus) %>%
summarise(mean=sum(relabun)) %>%
arrange(-mean) %>%
select(genus) %>%
pull()
genus_summary %>%
left_join(genome_metadata %>% select(genus,phylum) %>% unique(),by=join_by(genus==genus)) %>%
left_join(sample_metadata,by=join_by(sample==sample)) %>%
filter(genus %in% genus_arrange[1:20]) %>%
mutate(genus=factor(genus,levels=rev(genus_arrange[1:20]))) %>%
filter(relabun > 0) %>%
ggplot(aes(x=relabun, y=genus, group=genus, color=phylum)) +
scale_color_manual(values=phylum_colors) +
geom_jitter(alpha=0.5) +
facet_grid(.~host_sp)+
theme_minimal() +
labs(y="Genus", x="Relative abundance", color="Phylum")Warning in left_join(., genome_metadata %>% select(genus, phylum) %>% unique(), : Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 1 of `x` matches multiple rows in `y`.
ℹ Row 41 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.